VACUUM

VACUUM

VISUAL AUDIO COMPARISON UTILITY [FOR] UNDERSTANDING [AND] MEASUREMENT
A testing and analysis workflow

Imports

In [78]:
# Sanity-check which interpreter and site-packages this kernel is using
# (guards against running outside the project venv)
import sys

print("Path: " + str(sys.path))
print("Executable: " + str(sys.executable))
Path: ['/Volumes/MACSTORE/Users/benjamin/dev/cs499/visualMIDIcompare', '/usr/local/Cellar/python/3.7.2_2/Frameworks/Python.framework/Versions/3.7/lib/python37.zip', '/usr/local/Cellar/python/3.7.2_2/Frameworks/Python.framework/Versions/3.7/lib/python3.7', '/usr/local/Cellar/python/3.7.2_2/Frameworks/Python.framework/Versions/3.7/lib/python3.7/lib-dynload', '', '/Volumes/MACSTORE/Users/benjamin/dev/cs499/visualMIDIcompare/venv/lib/python3.7/site-packages', '/Volumes/MACSTORE/Users/benjamin/dev/cs499/visualMIDIcompare/venv/lib/python3.7/site-packages/IPython/extensions', '/Volumes/MACSTORE/Users/benjamin/.ipython']
Executable: /Volumes/MACSTORE/Users/benjamin/dev/cs499/visualMIDIcompare/venv/bin/python3
In [79]:
from __future__ import print_function
In [80]:
# Librosa Imports
import librosa
import librosa.display
import IPython.display
import numpy as np
import scipy
In [81]:
# Matplotlib Imports
import matplotlib.pyplot as plt
import matplotlib.style as ms
ms.use('seaborn-muted')
%matplotlib inline

Let's bring the files in

In [181]:
# All input audio lives under this directory
SOURCE_DIR = 'source/'

# Source1 Audio Path.
# Alternate test inputs kept for quick swapping:
#   'source1.wav', 'tuneBuildTest.wav', 'Chopin1.wav'
source1_audio_fn = 'Yundi_2017.wav'
source1_audio_path = SOURCE_DIR + source1_audio_fn

# Source2 Audio Path.
# Alternates: 'source2.wav', 'tuneBuildTest2.wav', 'Chopin2.wav'
source2_audio_fn = 'Rosenthal_1930.wav'
source2_audio_path = SOURCE_DIR + source2_audio_fn

Source1 Track

Open Source1, get some basic statistics and create a player

In [83]:
# Load source1 track (librosa.load resamples to its 22050 Hz mono default)
y, sr = librosa.load(source1_audio_path)

# Track duration in seconds
source1_duration = librosa.get_duration(y=y, sr=sr)

# Tuning estimate: deviation from A440, in fractions of a chroma bin
source1_tuning = librosa.estimate_tuning(y=y, sr=sr)

print("File: {} \nDuration: {:2.4f} sec".format(
    source1_audio_path, source1_duration))
print("Tuning estimate: {}".format(source1_tuning))

# Play back original track
IPython.display.Audio(data=y, rate=sr)
File: source/Yundi_2017.wav 
Duration: 272.4534 sec
Tuning estimate: 0.08999999999999997
Out[83]:

Let's take a first look at the file

In [84]:
# Mel-scaled power (energy-squared) spectrogram.
# Pass y by keyword: positional audio args are deprecated/removed in librosa >= 0.10.
S = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=128)

# Convert to log scale (dB), using peak power (max) as reference
log_S = librosa.power_to_db(S, ref=np.max)

# Prepare the graph: waveplot on top, spectrogram below
plt.figure(figsize=(20, 8))

# Display spectrogram on a mel scale.
# Sample rate and hop length parameters are used to render the time axis.
plt.subplot(2, 1, 2)
librosa.display.specshow(log_S, sr=sr, x_axis='time', y_axis='mel')
# NOTE(review): saving here, before the title/colorbar/waveplot are drawn,
# appears deliberate -- the exported PNG then contains only the bare
# spectrogram. Confirm before moving this below the annotations.
plt.savefig('output_images/source1_melSpectrogram.png', dpi=100, bbox_inches='tight')

# Give it a title
plt.title(source1_audio_path + " mel power spectrogram")

# Draw a colour bar
plt.colorbar(format='%+02.0f dB')

# Display a waveplot, breaking out harmonic and percussive components
plt.subplot(2, 1, 1)  # rendered above the spectrogram
y_harm, y_perc = librosa.effects.hpss(y)
librosa.display.waveplot(y=y_harm, sr=sr, color='purple', alpha=0.25)
librosa.display.waveplot(y=y_perc, sr=sr, color='r', alpha=0.25)
plt.tick_params(labelbottom=False)  # hide x labels (time axis shown on spectrogram)
plt.xlabel("", labelpad=10)
plt.colorbar().ax.set_visible(False)  # hidden colorbar keeps both subplots the same width
plt.title(source1_audio_path + ' Harmonic + Percussive Waveplot')

# Plot
plt.tight_layout()

Source 2 Track

Open Source2, get some basic statistics and create a player

In [85]:
# Load source2 track (librosa.load resamples to its 22050 Hz mono default)
y2, sr2 = librosa.load(source2_audio_path)

# Track duration in seconds
source2_duration = librosa.get_duration(y=y2, sr=sr2)

# Tuning estimate: deviation from A440, in fractions of a chroma bin
source2_tuning = librosa.estimate_tuning(y=y2, sr=sr2)

print("File: {} \nDuration: {:2.4f} sec".format(
    source2_audio_path, source2_duration))
print("Tuning estimate: {}".format(source2_tuning))

# Play back original track
IPython.display.Audio(data=y2, rate=sr2)
File: source/Rosenthal_1930.wav 
Duration: 243.1334 sec
Tuning estimate: 0.19000000000000006
Out[85]:

Let's take a first look at the file

In [86]:
# Mel-scaled power (energy-squared) spectrogram for source2.
# Pass y2 by keyword: positional audio args are deprecated/removed in librosa >= 0.10.
S2 = librosa.feature.melspectrogram(y=y2, sr=sr2, n_mels=128)

# Convert to log scale (dB). Use a source2-specific name so we do not
# clobber source1's log_S from the earlier cell (hidden-state hazard).
log_S2 = librosa.power_to_db(S2, ref=np.max)

# Prepare the graph
plt.figure(figsize=(20, 8))

# Display spectrogram on a mel scale
plt.subplot(2, 1, 2)
librosa.display.specshow(log_S2, sr=sr2, x_axis='time', y_axis='mel')
# Saved before title/colorbar so the exported PNG stays annotation-free
plt.savefig('output_images/source2_melSpectrogram.png', dpi=100, bbox_inches='tight')
plt.title(source2_audio_path + " mel power spectrogram")
plt.colorbar(format='%+02.0f dB')

# Display a waveplot, breaking out harmonic and percussive components.
# Source2-specific names avoid overwriting source1's y_harm/y_perc.
plt.subplot(2, 1, 1)  # displays first
y2_harm, y2_perc = librosa.effects.hpss(y2)
librosa.display.waveplot(y=y2_harm, sr=sr2, color='purple', alpha=0.25)
librosa.display.waveplot(y=y2_perc, sr=sr2, color='r', alpha=0.25)
plt.tick_params(labelbottom=False)  # hide x labels (time axis shown on spectrogram)
plt.xlabel("", labelpad=10)
plt.colorbar().ax.set_visible(False)  # hidden colorbar keeps both subplots the same width
plt.title(source2_audio_path + ' Harmonic + Percussive Waveplot')

# Plot
plt.tight_layout()

Enhanced chroma and chroma variants

Original

In [87]:
# Baseline chromagram: 12-bin chroma computed from the STFT of source1
chroma_orig = librosa.feature.chroma_stft(y=y, sr=sr)

# And for comparison, we'll show the CQT matrix as well
# (3 bins per semitone across 7 octaves)
C = np.abs(librosa.cqt(y=y, sr=sr, bins_per_octave=12*3, n_bins=7*12*3))

plt.figure(figsize=(20, 4))

# Top: CQT magnitude in dB
plt.subplot(2, 1, 1)
C_db = librosa.amplitude_to_db(C, ref=np.max)
librosa.display.specshow(C_db, y_axis='cqt_note', bins_per_octave=12*3)
plt.colorbar()

# Bottom: the baseline chroma
plt.subplot(2, 1, 2)
librosa.display.specshow(chroma_orig, y_axis='chroma')
plt.colorbar()
plt.ylabel('Original')

plt.tight_layout()

Correct Tuning Deviations

In [88]:
# CQT-based chroma with 3x oversampling (36 bins/octave) so small
# tuning deviations can be absorbed
chroma_os = librosa.feature.chroma_cqt(y=y, sr=sr, bins_per_octave=12*3)

plt.figure(figsize=(20, 4))

# Top: the original STFT chroma for reference
plt.subplot(2, 1, 1)
librosa.display.specshow(chroma_orig, y_axis='chroma')
plt.colorbar()
plt.ylabel('Original')

# Bottom: the oversampled CQT chroma
plt.subplot(2, 1, 2)
librosa.display.specshow(chroma_os, y_axis='chroma', x_axis='time')
plt.colorbar()
plt.ylabel('3x-over')

plt.tight_layout()

Isolate harmonic component

In [89]:
# Isolate the harmonic component before computing chroma;
# margin=8 requests an aggressive harmonic/percussive separation
y_harm = librosa.effects.harmonic(y=y, margin=8)
chroma_os_harm = librosa.feature.chroma_cqt(y=y_harm, sr=sr, bins_per_octave=12*3)

plt.figure(figsize=(20, 8))

# Top: oversampled chroma from the previous step
plt.subplot(2, 1, 1)
librosa.display.specshow(chroma_os, y_axis='chroma')
plt.colorbar()
plt.ylabel('3x-over')

# Bottom: chroma of the harmonic component only
plt.subplot(2, 1, 2)
librosa.display.specshow(chroma_os_harm, y_axis='chroma', x_axis='time')
plt.colorbar()
plt.ylabel('Harmonic')

plt.tight_layout()

Non-local filtering

In [90]:
# Non-local filtering: replace each frame by the median of its nearest
# neighbours (cosine metric), then take the elementwise minimum with the
# input so filtering can only remove energy, never add it
chroma_nn = librosa.decompose.nn_filter(chroma_os_harm,
                                        aggregate=np.median,
                                        metric='cosine')
chroma_filter = np.minimum(chroma_os_harm, chroma_nn)

plt.figure(figsize=(20, 8))

# Top: harmonic chroma from the previous step
plt.subplot(2, 1, 1)
librosa.display.specshow(chroma_os_harm, y_axis='chroma')
plt.colorbar()
plt.ylabel('Harmonic')

# Bottom: after non-local filtering
plt.subplot(2, 1, 2)
librosa.display.specshow(chroma_filter, y_axis='chroma', x_axis='time')
plt.colorbar()
plt.ylabel('Non-local')

plt.tight_layout()

Horizontal Median Filter

In [91]:
# Smooth along the time axis with a horizontal median filter
# (window of 1 chroma bin x 9 frames)
chroma_smooth = scipy.ndimage.median_filter(chroma_filter, size=(1, 9))

plt.figure(figsize=(20, 8))

# Top: the non-locally filtered chroma from the previous step
plt.subplot(2, 1, 1)
librosa.display.specshow(chroma_filter, y_axis='chroma')
plt.colorbar()
plt.ylabel('Non-local')

# Bottom: after median smoothing
plt.subplot(2, 1, 2)
librosa.display.specshow(chroma_smooth, y_axis='chroma', x_axis='time')
plt.colorbar()
plt.ylabel('Median-filtered')

plt.tight_layout()

Before and After

In [92]:
# Before/after summary for source1: raw CQT, baseline chroma, processed chroma
plt.figure(figsize=(20, 8))

plt.subplot(3, 1, 1)
librosa.display.specshow(librosa.amplitude_to_db(C, ref=np.max),
                         y_axis='cqt_note', bins_per_octave=12*3)
plt.colorbar()
plt.ylabel('CQT')

# Rows 2-3: original vs fully processed chroma (x_axis=None is specshow's default)
for row, (mat, label, xaxis) in enumerate(
        [(chroma_orig, 'Original', None),
         (chroma_smooth, 'Processed', 'time')], start=2):
    plt.subplot(3, 1, row)
    librosa.display.specshow(mat, y_axis='chroma', x_axis=xaxis)
    plt.ylabel(label)
    plt.colorbar()

plt.tight_layout()
plt.show()

Applying chroma enhancement techniques to the source files

Source1

In [93]:
# Full enhancement pipeline for source1, collapsed into one cell:
# harmonic separation -> oversampled CQT chroma -> non-local filter -> median smooth
# (chroma_orig and C are reused from the earlier cells)

y_harm = librosa.effects.harmonic(y=y, margin=8)
chroma_os_harm = librosa.feature.chroma_cqt(y=y_harm, sr=sr, bins_per_octave=12*3)

chroma_nn = librosa.decompose.nn_filter(chroma_os_harm,
                                        aggregate=np.median,
                                        metric='cosine')
chroma_filter = np.minimum(chroma_os_harm, chroma_nn)

chroma_smooth = scipy.ndimage.median_filter(chroma_filter, size=(1, 9))

plt.figure(figsize=(20, 8))

plt.subplot(3, 1, 1)
librosa.display.specshow(librosa.amplitude_to_db(C, ref=np.max),
                         y_axis='cqt_note', bins_per_octave=12*3)
plt.colorbar()
plt.ylabel('CQT')

# Rows 2-3: original vs fully processed chroma
for row, (mat, label, xaxis) in enumerate(
        [(chroma_orig, 'Original', None),
         (chroma_smooth, 'Processed', 'time')], start=2):
    plt.subplot(3, 1, row)
    librosa.display.specshow(mat, y_axis='chroma', x_axis=xaxis)
    plt.ylabel(label)
    plt.colorbar()

plt.tight_layout()
plt.show()

Source2

In [94]:
# Source2: baseline CQT chroma, then the same enhancement pipeline
chroma_orig_2 = librosa.feature.chroma_cqt(y=y2, sr=sr2)

# And for comparison, we'll show the CQT matrix as well
C2 = np.abs(librosa.cqt(y=y2, sr=sr2, bins_per_octave=12*3, n_bins=7*12*3))

# Harmonic separation -> oversampled chroma
y_harm_2 = librosa.effects.harmonic(y=y2, margin=8)
chroma_os_harm_2 = librosa.feature.chroma_cqt(
    y=y_harm_2, sr=sr2, bins_per_octave=12*3)

# Non-local filtering; the minimum keeps the filter from adding energy
chroma_nn_2 = librosa.decompose.nn_filter(chroma_os_harm_2,
                                          aggregate=np.median,
                                          metric='cosine')
chroma_filter_2 = np.minimum(chroma_os_harm_2, chroma_nn_2)

# Horizontal median smoothing over 9 frames
chroma_smooth_2 = scipy.ndimage.median_filter(chroma_filter_2, size=(1, 9))

plt.figure(figsize=(20, 8))

plt.subplot(3, 1, 1)
librosa.display.specshow(librosa.amplitude_to_db(C2, ref=np.max),
                         y_axis='cqt_note', bins_per_octave=12*3)
plt.colorbar()
plt.ylabel('CQT')

# Rows 2-3: original vs fully processed chroma
for row, (mat, label, xaxis) in enumerate(
        [(chroma_orig_2, 'Original', None),
         (chroma_smooth_2, 'Processed', 'time')], start=2):
    plt.subplot(3, 1, row)
    librosa.display.specshow(mat, y_axis='chroma', x_axis=xaxis)
    plt.ylabel(label)
    plt.colorbar()

plt.tight_layout()
plt.show()

Output comparisons for testing

In [95]:
# Export the two processed chromagrams as bare images for the diff below.
# savefig runs before plt.title on purpose: the saved PNGs must carry no
# annotations, since they are compared pixel-by-pixel later.
for mat, out_path, title in [
        (chroma_smooth, 'output_images/source1_chromagram.png', 'Source1'),
        (chroma_smooth_2, 'output_images/source2_chromagram.png', 'Source2')]:
    plt.figure(figsize=(12, 4))
    librosa.display.specshow(mat)
    plt.savefig(out_path, dpi=100, bbox_inches='tight')
    plt.title(title)
    plt.tight_layout()
    plt.show()

Simple ImageDiff

Reference: python-pil-how-to-compare-two-images

Run imageDiff

In [70]:
# %run -i imageDiff/image_diff.py --first ../visualMIDIcompare/output_images/source1_chromagram.png --second ../visualMIDIcompare/output_images/source2_chromagram.png
In [184]:
## import the necessary packages
# compare_ssim moved to skimage.metrics.structural_similarity in
# scikit-image 0.16 and was removed from skimage.measure in 0.18;
# support both layouts so the notebook runs on old and new installs.
try:
    from skimage.measure import compare_ssim
except ImportError:
    from skimage.metrics import structural_similarity as compare_ssim
import imutils
import cv2

# load the two input images (the bare chromagrams exported above)
imageA = cv2.imread("../visualMIDIcompare/output_images/source1_chromagram.png")
imageB = cv2.imread("../visualMIDIcompare/output_images/source2_chromagram.png")

# convert the images to grayscale
grayA = cv2.cvtColor(imageA, cv2.COLOR_BGR2GRAY)
grayB = cv2.cvtColor(imageB, cv2.COLOR_BGR2GRAY)

# compute the Structural Similarity Index (SSIM) between the two
# images, ensuring that the difference image is returned.
# score can fall between [-1, 1] with 1 being a perfect match
(score, diff) = compare_ssim(grayA, grayB, full=True)
diff = (diff * 255).astype("uint8")
print("SSIM: {}".format(score))

# threshold the difference image, followed by finding contours to
# obtain the regions of the two input images that differ
thresh = cv2.threshold(diff, 0, 255,
                       cv2.THRESH_BINARY_INV | cv2.THRESH_OTSU)[1]
cnts = cv2.findContours(thresh.copy(), cv2.RETR_EXTERNAL,
                        cv2.CHAIN_APPROX_SIMPLE)
cnts = imutils.grab_contours(cnts)

# loop over the contours
for c in cnts:
    # compute the bounding box of the contour and draw it on both
    # inputs to mark where the two images differ.
    # BUG FIX: the original unpacked into (x, y, w, h), overwriting the
    # source1 audio signal `y` loaded earlier -- use distinct names.
    (bx, by, bw, bh) = cv2.boundingRect(c)
    cv2.rectangle(imageA, (bx, by), (bx + bw, by + bh), (0, 50, 255), 2)
    cv2.rectangle(imageB, (bx, by), (bx + bw, by + bh), (0, 50, 255), 2)

# show the output images
plt.figure(figsize=(18, 8))

plt.subplot(2, 2, 1)
plt.tick_params(bottom=False, left=False, labelleft=False, labelbottom=False)
plt.imshow(cv2.cvtColor(imageA, cv2.COLOR_BGR2RGB))
plt.title("Source1")

plt.subplot(2, 2, 2)
plt.tick_params(bottom=False, left=False, labelleft=False, labelbottom=False)
plt.imshow(cv2.cvtColor(imageB, cv2.COLOR_BGR2RGB))
plt.title("Source2")

# save the annotated side-by-side now; only subplots 1-2 exist at this
# point, so the exported image omits the diff/threshold panels
plt.savefig("output_images/{}_{}_diff.png".format(source1_audio_fn, source2_audio_fn), dpi=100, bbox_inches='tight')

plt.subplot(2, 2, 3)
plt.tick_params(bottom=False, left=False, labelleft=False, labelbottom=False)
# BUG FIX: diff is a single-channel 2-D array; cv2.COLOR_BGR2RGB requires
# 3 channels and raises cv2.error -- render directly as grayscale instead
plt.imshow(diff, cmap='gray')
plt.title("Diff")

plt.subplot(2, 2, 4)
plt.tick_params(bottom=False, left=False, labelleft=False, labelbottom=False)
# thresh is also single-channel; same grayscale rendering applies
plt.imshow(thresh, cmap='gray')
plt.title("Threshold")

plt.tight_layout()
SSIM: 0.2895809881272474
In [ ]: